# R script for constructing key measures for replicating all analysis
# in 'When Parties Move to the Middle: The Role of Uncertainty'
# by J. Lindvall, D. Rueda, and H. Zhai
# this file written by: H Zhai (2022-11-03 [updated: -])
# on device: Mac Pro 13 Dual-Core Intel Core i5 2.3 GHz 

# PLEASE MAKE SURE ALL THE REPLICATION FILES (DATA AND SCRIPTS) ARE STORED AT THE SAME LEVEL IN THE SAME DIRECTORY
# OR MAKE SURE THE DIRECTORY-RELATED CODES ARE PROPERLY ADJUSTED 
# TO ENSURE THE CODES RUN WITHOUT DIRECTORY-RELATED PROBLEMS
# RESTART R SESSION BEFORE RUNNING

# This file cleans the raw data and makes the measures used for main and supplementary analyses

# BEGIN SCRIPT
rm(list = ls())

# pkgs --------------------------------------------------------------------

## make sure every package used in this script is installed
## (zoo is only ever called as `zoo::...`, so it must be installed but is not attached)
invisible(lapply(c("tidyverse", "scales", "haven", "zoo"), function(pkg) {
  if (!requireNamespace(pkg, quietly = TRUE)) install.packages(pkg)
}))

## attach the packages; unlike require(), library() stops with an error
## if a package is still unavailable, and it also attaches freshly installed ones
library(tidyverse)
library(scales)
library(haven)

# manifesto data ----------------------------------------------------------

## import the raw MARPOR file (Stata format)
manifesto <- haven::read_dta("MPDataset_MPDS2019b_stata14.dta")

## election year = first four characters of the election date
manifesto[["eyear"]] <- as.numeric(substr(manifesto[["edate"]], 1, 4))

## restrict to OECD countries:
## keep every country whose largest `oecdmember` value equals 10
## (presumably 10 = OECD member at election time; confirm against the MARPOR codebook)
manifesto_oecd <- manifesto %>%
  group_by(country) %>%
  filter(max(oecdmember) == 10) %>%
  ungroup()

## list the distinct country-year combinations as a check
with(manifesto_oecd, unique(paste0(countryname, "-", eyear)))

## store the subset on disk
save(manifesto_oecd, file = "RData_manifesto_oecd.RData")

## empty the workspace
rm(list = ls())

# measure: policy scales --------------------------------------------------

## read the OECD manifesto subset back in
load("RData_manifesto_oecd.RData")

## the left-right position is built in two ways:
##   (1) simple difference - main measure
##   (2) log ratio         - robustness check
## on both scales a larger value means a more rightist position

## manifesto categories counted as right mentions (all right rile items) ...
colname_r <- c(
  "per104", "per201", "per203", "per305", "per401", "per402", "per407",
  "per414", "per505", "per601", "per603", "per605", "per606"
)
## ... and as left mentions (all left rile items)
colname_l <- c(
  "per103", "per105", "per106", "per107", "per403", "per404", "per406",
  "per412", "per413", "per504", "per506", "per701", "per202"
)

## (1) simple difference: the rile score shipped with the data,
##     rescaled onto a 0-100 scale with the original 0 kept as the midpoint
leftright <- scales::rescale_mid(manifesto_oecd[["rile"]], to = c(0, 100), mid = 0)

## (2) log ratio: log of summed right mentions over summed left mentions,
##     with 0.5 added to each sum; rescaled the same way as (1)
mentions_r <- rowSums(manifesto_oecd[colname_r]) + .5
mentions_l <- rowSums(manifesto_oecd[colname_l]) + .5
leftright_log <- scales::rescale_mid(log(mentions_r / mentions_l), to = c(0, 100), mid = 0)

## how closely do the two scales agree? (author's recorded result: r=0.93, p<0.001)
cor.test(leftright, leftright_log)

## store both scales on disk
save(leftright, leftright_log, file = "RData_measure_scale.RData")

## empty the workspace
rm(list = ls())

# measure: party variables ------------------------------------------------

## load raw data
load("RData_manifesto_oecd.RData") # oecd manifesto data
load("RData_measure_scale.RData") # policy scales, in the same row order as manifesto_oecd

## identify main lr parties
## a party is labelled "main left"/"main right" only if it (a) belongs to the
## relevant party families and (b) holds the largest vote share among them in
## that election. Without check (a), any party whose vote share merely equals
## the family maximum would be mislabelled; e.g. if the largest left and the
## largest right party polled the same share, both would become "main left"
## and the election would have no "main right" party at all.
## NOTE: this can change labels (and downstream measures) in elections with such ties.
## c(-Inf, ...) keeps max() from warning when an election has no party of a family;
## the resulting -Inf never equals a vote share, so no party is labelled.
manifesto_mlr <- manifesto_oecd %>% 
  group_by(country, edate) %>% 
  mutate(party_mlr = case_when(
    # main left = largest socialists/social democrats
    parfam %in% c(20, 30) &
      pervote == max(c(-Inf, pervote[parfam %in% c(20, 30)]), na.rm = TRUE) ~ "main left",
    # main right = largest christian democrats/conservatives
    parfam %in% c(50, 60) &
      pervote == max(c(-Inf, pervote[parfam %in% c(50, 60)]), na.rm = TRUE) ~ "main right",
    TRUE ~ "other")) %>%
  ungroup() %>% 
  select(country, countryname, eyear, edate, party, partyname, parfam, party_mlr, pervote) # select vars

## list of parties that were ever a main party (not saved below; for inspection)
party_main <- manifesto_mlr %>% 
  filter(party_mlr!="other") %>% # drop non-main parties
  group_by(country, countryname, partyname) %>% 
  summarise(parfam = unique(parfam), party_mlr = unique(party_mlr), 
            pervote = mean(pervote)) %>% 
  arrange(countryname, party_mlr, partyname) # sort by alphabetical order

## construct measures
## the scales are attached by position, so this relies on manifesto_mlr keeping
## the row order of manifesto_oecd (mutate/select above do not reorder rows)
measure_party <- manifesto_mlr %>% 
  select(eyear, edate, country, countryname, party_mlr) %>% # election id 
  cbind.data.frame(leftright, leftright_log) %>% 
  filter(!is.na(party_mlr)) %>% 
  group_by(country, countryname, eyear, edate) %>% # group by election
  summarise( 
    # policy position (mean over the main parties of each side; NaN if there is none)
    leftright_ml = mean(leftright[party_mlr=="main left"]),
    leftright_log_ml = mean(leftright_log[party_mlr=="main left"]), # main left 
    leftright_mr = mean(leftright[party_mlr=="main right"]),
    leftright_log_mr = mean(leftright_log[party_mlr=="main right"])) %>% # main right 
  mutate( 
    # policy midpoint
    leftright_mid = (leftright_ml + leftright_mr)/2,
    leftright_log_mid = (leftright_log_ml + leftright_log_mr)/2,
    # policy distance
    ## left-right distance
    leftright_dif = abs(leftright_ml - leftright_mr),
    leftright_log_dif = abs(leftright_log_ml - leftright_log_mr)) %>% ungroup()

## sanity check
summary(measure_party)

## save data
save(measure_party, file = "RData_measure_party.RData")
#save(party_main, file = "RData_party_main.RData") # save party list
save(manifesto_mlr, file = "RData_manifesto_mlr.RData") 

## clean env.
rm(list = ls())

# measure: voter variables ------------------------------------------------

## read the OECD manifesto subset and the two policy scales
load("RData_manifesto_oecd.RData")
load("RData_measure_scale.RData")

## assemble the working data:
## election variables + the two policy scales (attached by row position),
## keeping complete rows only
measure_data <- drop_na(
  cbind.data.frame(
    select(manifesto_oecd, country, countryname, edate, eyear, pervote),
    leftright, leftright_log
  )
)
#save(measure_data, file = "RData_measure_oecd_scale.RData") # save for other use

## drop the raw objects whose names match the pattern (avoids name conflicts below)
rm(list = grep("leftright|welfare|manifesto", ls(), value = TRUE))

## define helper
medvoter <- function(measure_var, var) {
  # median position and median-class density per election, treating the parties
  # of an election as the classes of a grouped frequency distribution
  # (party positions locate the classes, vote shares are the class frequencies)
  #
  # args:
  #   measure_var: data frame with the election ids (country, countryname,
  #                eyear, edate), the vote share `pervote`, and the position
  #                column named by `var`
  #   var:         name of the position column, as a string
  # returns:
  #   one row per election: the four election ids plus
  #   `<var>_med` (median position) and `<var>_den` (median class density)
  #
  # NOTE: compared with earlier versions of this helper, class boundaries are
  # kept at full numeric precision and the first-class case of the median is
  # corrected (see comments below), so results can differ slightly.

  # function for group frequency distribution parameters (median + median class frequency density)
  ## adapted from: https://stackoverflow.com/questions/18887382/how-to-calculate-the-median-on-grouped-dataset
  ## user A5C1D2H2I1M1N2O1R2T1's open-source contribution (2013-09-22)
  ##   x:   class positions, sorted ascending (sorting is done by the caller below)
  ##   f:   class frequencies, in the same order as x
  ##   out: "med" returns the median, any other non-NULL value the class density,
  ##        NULL returns NULL
  gmed <- function(x, f, out = NULL) { # saved in separate file
    if (is.null(out)) return(invisible(NULL)) # nothing requested
    low <- min(x) # lower boundary of distribution
    up <- max(x) # upper boundary of distribution
    mid <- zoo::rollmean(x, k = 2) # interval boundaries = midpoints of neighbouring positions
    # drop trailing midpoints that (numerically) coincide with the upper boundary;
    # the length check avoids calling max() on an empty vector
    while (length(mid) > 0 && isTRUE(all.equal(max(mid), up))) {
      mid <- mid[-length(mid)]
    }
    breaks <- unique(c(low, mid, up)) # boundaries
    if (length(breaks) < 2) return(NA_real_) # all positions coincide: no class width
    # class of every position, with classes (lower, upper] and the first class
    # closed on the left. Boundaries are taken from `breaks` directly rather than
    # parsed from cut() labels, which are rounded to 3 significant digits by default.
    cls <- findInterval(x, breaks, rightmost.closed = TRUE, left.open = TRUE)
    lower <- breaks[cls] # lower boundary of each position's class
    upper <- breaks[cls + 1] # upper boundary of each position's class
    cf <- cumsum(f) # cumulative frequency
    n_2 <- max(cf)/2 # total frequency divided by 2
    midrow <- findInterval(n_2, cf) + 1 # median class
    l <- lower[midrow] # lower class boundary of median class
    h <- upper[midrow] - lower[midrow] # width of median class
    f2 <- f[midrow] # frequency of median class
    # cumulative frequency below the median class; zero when the median class is
    # the first one (using cf[1] there would place the median below `l`)
    cf2 <- if (midrow > 1) cf[midrow - 1] else 0
    # return output as required
    if (out == "med") {
      unname(l + (n_2 - cf2)/f2 * h) # return median
    } else {
      unname(f2 / (h*max(cf))) # return class density
    }
  } 
  # apply to country-election sets
  medpos <- paste0(var, "_med") # varname for position
  medden <- paste0(var, "_den") # varname for density
  measure_var %>%
    group_by(country, countryname, eyear, edate) %>% # group by election ids
    filter(n() > 1) %>% # drop one-party elections  
    arrange(!!sym(var)) %>% # sort by position
    nest() %>% # nest
    # `.x[[var]]` extracts the position column by name (replaces the deprecated UQ() form)
    mutate(pos = map(.x = data, .f = ~gmed(.x[[var]], .x$pervote, out = "med")), # median
           den = map(.x = data, .f = ~gmed(.x[[var]], .x$pervote, out = "den"))) %>% # median class density
    unnest(c(pos, den)) %>% # unnest results
    ungroup() %>% # drop grouping 
    rename(!! medpos := pos, !! medden := den) %>% # rename results
    select(-data) # drop nested data column
} 

## construct measures

### simple difference
medvoter_lr <- medvoter(measure_data, "leftright")
hist(medvoter_lr$leftright_den) # author's note: right-skewed
medvoter_lr <- medvoter_lr %>%
  mutate(
    # log of the density (author's note: no zeros, so no +1 correction needed)
    leftright_den_log = log(leftright_den),
    # logged density stretched onto 0-100
    leftright_den_rlog = scales::rescale(leftright_den_log, to = c(0, 100))
  )
hist(medvoter_lr$leftright_den_rlog) # author's note: normal

### log ratio
medvoter_lr_log <- medvoter(measure_data, "leftright_log")
hist(medvoter_lr_log$leftright_log_den) # author's note: right-skewed
medvoter_lr_log <- medvoter_lr_log %>%
  mutate(
    # log of the density
    leftright_log_den_log = log(leftright_log_den),
    # logged density stretched onto 0-100
    leftright_log_den_rlog = scales::rescale(leftright_log_den_log, to = c(0, 100))
  )
hist(medvoter_lr_log$leftright_log_den_rlog) # author's note: normal

## put both sets of measures side by side (joined on their shared columns)
measure_voter <- medvoter_lr %>% full_join(medvoter_lr_log)

## quick look at the distributions
summary(measure_voter)

## store on disk
save(measure_voter, file = "RData_measure_voter.RData")

## empty the workspace
rm(list = ls())

# measure: controls - manifesto -------------------------------------------

## load raw data
load("RData_manifesto_mlr.RData") # marpor main lr data
load("RData_measure_scale.RData") # policy scale data (attached by row position below)

## construct measures

### left/right competitors
### dummy = 1 if some party in the election sits left of the main left party
### (resp. right of the main right party) on the given scale
### NOTE(review): if an election has several main left/right parties (vote ties),
### the comparison recycles over them - confirm this is acceptable
measure_marpor_comp <- manifesto_mlr %>% 
  select(-partyname, -pervote) %>% # party parameters only
  cbind.data.frame(leftright, leftright_log) %>% 
  group_by(country, countryname, eyear, edate) %>% 
  summarise(compl_leftright = max(ifelse(leftright < leftright[party_mlr == "main left"], 1, 0)), # left competitor to main left 
            compr_leftright = max(ifelse(leftright > leftright[party_mlr == "main right"], 1, 0)), # right competitor to main right
            compl_leftright_log = max(ifelse(leftright_log < leftright_log[party_mlr == "main left"], 1, 0)), # (log) left competitor to ml 
            compr_leftright_log = max(ifelse(leftright_log > leftright_log[party_mlr == "main right"], 1, 0))) %>% 
  ungroup() %>% # build the factors on the whole column, not group by group
  mutate_at(.vars = vars(matches("comp")), 
            # replace na / -Inf (no main party in the election) by 0 & make dummy;
            # explicit levels keep "0" as the first level whatever the data order
            .funs = ~factor(ifelse(is.na(.) | . == -Inf, 0, .), levels = c(0, 1)))

### polarization (Dalton 2008)
### index = sqrt( sum_i voteshare_i * ((position_i - mean position) / 50)^2 );
### the square applies to each party's scaled deviation BEFORE summing
### (squaring the sum instead would just give the absolute weighted deviation).
### NOTE: this changes the polarization values relative to earlier versions.
### NOTE(review): the mean position used here is unweighted; Dalton's index
### presumably uses the vote-weighted system mean - confirm which is intended
measure_marpor_pol <- manifesto_mlr %>%  
  select(country, countryname, eyear, edate, pervote) %>% # election parameters
  cbind.data.frame(leftright, leftright_log) %>% 
  group_by(country, countryname, eyear, edate) %>% 
  summarise_at(.vars = vars(matches("leftright")),
               .funs = list(pl = ~ sqrt(sum(pervote * ((. - mean(., na.rm = TRUE)) / 50)^2)))) %>% # apply formula
  group_by(countryname) %>% # regroup by country only
  mutate_at(.vars = vars(matches("_pl")),
            .funs = list(lag = ~dplyr::lag(., 1))) %>% # lag by 1 election
  select(-ends_with("_pl")) %>% # drop non-lag 
  ungroup()

## collect measures (one row per election; keys spelled out explicitly)
measure_marpor <- full_join(measure_marpor_comp, measure_marpor_pol,
                            by = c("country", "countryname", "eyear", "edate"))

## sanity check
summary(measure_marpor) 

## save data
save(measure_marpor, file = "RData_measure_marpor.RData")

## clean env.
rm(list = ls())

# measure: controls - CPDS ------------------------------------------------

## load raw data
cpds <- haven::read_dta("CPDS-1960-2017-Update-2019.dta") # raw cpds data
cpds$country[cpds$country=="USA"] <- "United States" # correct usa country name

## construct measures
## NOTE(review): rolling means and lags below follow the row order within each
## country; this assumes the raw file is sorted by year within country - confirm
## NOTE(review): `100 - gov_party` only reverses the direction of the variable;
## if gov_party is the 1-5 cabinet composition index, the result ranges 95-99 - confirm intended
measure_cpds <-
  cpds %>%
  select(year, country, # id vars
         gov_party, effpar_ele, vturn, ud_ipol, 
         openc, realgdpgr, unemp
  ) %>% 
  # gov party (higher=leftist), eff. party n, voter turnout, union density (intd), trade (% gdp), gdp growth, unemployment (%)
  mutate(gov_party = 100 - gov_party) %>% # gov party reversed (higher=rightist)
  rename(countryname = country, eyear = year) %>%
  group_by(countryname) %>%
  mutate_at(.vars = vars(gov_party, ud_ipol, openc, realgdpgr, unemp),
            # 5-year trailing average for long-term vars (the `_y3` suffix is kept
            # so that column names stay unchanged for downstream scripts).
            # rollapply(mean) is used instead of rollmean because rollmean does not
            # handle NAs: one missing year would blank every later value of the
            # series, whereas here only windows containing an NA become NA.
            .funs = list(y3 = ~zoo::rollapply(., 5, mean, align = "right", fill = NA))
  ) %>% 
  mutate(effpar_ele_lag = dplyr::lag(effpar_ele, 1), vturn_lag = dplyr::lag(vturn, 1)) %>% # lag eff. party no. & turnout by one row (year)
  ungroup() %>%
  select(countryname, eyear, effpar_ele_lag, vturn_lag, gov_party_y3:last_col()) # select vars

## sanity check
summary(measure_cpds)

## save data
save(measure_cpds, file = "RData_measure_cpds.RData")

## clean env.
rm(list = ls())

# END SCRIPT 